Rob Baker et al fit growth parameters to B. rapa growth data and then used those to do function-value-trait QTL mapping.
I used those same parameters and queried a mutual rank (MR) gene expression network to find genes that were in a networks associated with these growth paramters. This was done separately for the CR and UN environments. I looked for overlap between those MR-associated genes and growth parameters.
The goal now is to find the trans eQTL for each of the MR-growth associated genes and ask if the trans eQTL overlap with the growth/function-value QTL. I think I am only going to take the top eQTL for each.
Focused on UN genes only.
This is for the CIM eQTL results
For each gene in a MR network with a growth parameter query the eQTL database to find its trans eQTL regions. Then compare overlaps between those and the growth QTL.
Libraries
library(qtl)
package ‘qtl’ was built under R version 3.5.2
library(GenomicRanges)
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums, dirname, do.call, duplicated, eval,
evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste,
pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
tapply, union, unique, unsplit, which, which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Loading required package: GenomeInfoDb
library(tidyverse)
[30m── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.1.1 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.1 [32m✔[30m [34mdplyr [30m 0.8.0.[31m1[30m
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0 [39m
package ‘ggplot2’ was built under R version 3.5.2package ‘tibble’ was built under R version 3.5.2package ‘tidyr’ was built under R version 3.5.2package ‘purrr’ was built under R version 3.5.2package ‘dplyr’ was built under R version 3.5.2package ‘stringr’ was built under R version 3.5.2package ‘forcats’ was built under R version 3.5.2[30m── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mcollapse()[30m masks [34mIRanges[30m::collapse()
[31m✖[30m [34mdplyr[30m::[32mcombine()[30m masks [34mBiocGenerics[30m::combine()
[31m✖[30m [34mdplyr[30m::[32mdesc()[30m masks [34mIRanges[30m::desc()
[31m✖[30m [34mtidyr[30m::[32mexpand()[30m masks [34mS4Vectors[30m::expand()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mfirst()[30m masks [34mS4Vectors[30m::first()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mggplot2[30m::[32mPosition()[30m masks [34mBiocGenerics[30m::Position(), [34mbase[30m::Position()
[31m✖[30m [34mpurrr[30m::[32mreduce()[30m masks [34mGenomicRanges[30m::reduce(), [34mIRanges[30m::reduce()
[31m✖[30m [34mdplyr[30m::[32mrename()[30m masks [34mS4Vectors[30m::rename()
[31m✖[30m [34mdplyr[30m::[32mslice()[30m masks [34mIRanges[30m::slice()[39m
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
Growth QTL
filepath <- "../input/All2012HeightQTL2.xlsx"
filebase <- filepath %>% basename() %>% str_replace("\\..*$","")
QTLgenes <- readxl::read_excel(filepath)[,-1]
-
/
New names:
* `` -> ...1
QTLgenes <- QTLgenes %>% dplyr::rename(.id=QTL, FVTtrait=FVT) # change names to match previous file
QTLgenes <- QTLgenes %>% filter(str_detect(FVTtrait,"^UN"))
QTLgenes
MR genes from UN
MR_UN_genes <- read_csv("../output/MR_UN_graphs_node_annotation_2012.csv") %>%
filter(MR_Cutoff <= 50, !duplicated(name)) %>%
mutate(pos=floor((start+end)/2)) %>%
select(MR_Cutoff,name, transcript_chrom=chrom, transcript_pos=pos)
Parsed with column specification:
cols(
.default = col_double(),
.id = [31mcol_character()[39m,
trt = [31mcol_character()[39m,
name = [31mcol_character()[39m,
chrom = [31mcol_character()[39m,
subject = [31mcol_character()[39m,
AGI = [31mcol_character()[39m,
At_symbol = [31mcol_character()[39m,
At_description = [31mcol_character()[39m
)
See spec(...) for full column specifications.
MR_UN_genes
eQTL
load("../output/scanone-MRgene-qtl_2012.RData")
scanone_CIM_UN <- scanone_MR_cim
scanone_CIM_UN <- scanone_CIM_UN[!str_detect(rownames(scanone_CIM_UN),"loc"),] #downstream code cannot deal with interpolated markers.
Get the scanone data in a nice format for summarizing and plotting
scanone_UN_gather <- scanone_CIM_UN %>%
gather(key = gene, value = LOD, -chr, -pos) %>%
right_join(MR_UN_genes,by=c("gene"="name")) # only keep genes in MR networks
previously I calcualted the 0.05 threshold of each gene separately and then took the average. What if I calculate the experimenwise threshold?
dim(permtest.cim)
[1] 1000 56
This represents the 1000 permutations for each gene. So to get an experiment wise threshold I could take the max for each row and then to the 95% of that.
newthreshold <- permtest.cim %>% apply(1,max) %>% quantile(0.95)
newthreshold
95%
6.949558
For MR30, this should only include the MR30 genes
MR30genes <- MR_UN_genes %>% filter(MR_Cutoff <=30) %>% pull(name)
newthresholdMR30 <- permtest.cim[, MR30genes] %>%
apply(1,max) %>%
quantile(0.95)
newthresholdMR30
95%
6.564742
pl.UN <- scanone_UN_gather %>%
ggplot(aes(x=pos,y=LOD,color=gene)) +
geom_line() +
geom_hline(aes(yintercept=newthreshold),lty=2,lwd=.5,alpha=.5) +
facet_grid( ~ chr, scales="free") +
theme(strip.text.y = element_text(angle=0), axis.text.x = element_text(angle=90)) +
ggtitle("MR gene eQTL")
pl.UN
ggsave(str_c("../output/MR50 gene eQTL UN CIM", Sys.Date(), ".pdf"),width=12,height=8)
ggsave(str_c("../output/MR50 gene eQTL UN CIM", Sys.Date(), ".png"),width=10,height=5)
pl.UN + coord_cartesian(ylim=c(0,10))
plot eQTL peaks for MR30
pl.UN <- scanone_UN_gather %>%
filter(MR_Cutoff <=30) %>%
ggplot(aes(x=pos,y=LOD,color=gene)) +
geom_line() +
geom_hline(aes(yintercept=newthresholdMR30),lty=2,lwd=.5,alpha=.5) +
facet_grid( ~ chr, scales="free") +
theme(strip.text.y = element_text(angle=0), axis.text.x = element_text(angle=90)) +
ggtitle("MR gene eQTL")
pl.UN
ggsave(str_c("../output/MR30 gene eQTL UN CIM", Sys.Date(), ".pdf"),width=12,height=8)
ggsave(str_c("../output/MR30 gene eQTL UN CIM", Sys.Date(), ".png"),width=10,height=5)
scanone_UN_gather %>%
arrange(transcript_chrom,transcript_pos,pos) %>%
filter(LOD>newthreshold) %>%
group_by(gene,chr) %>%
filter(LOD==max(LOD)) %>%
ungroup() %>%
mutate(transcript_index=row_number(),cis_trans=ifelse(chr==transcript_chrom,"cis","trans")) %>%
ggplot(aes(x=pos,y=transcript_index,shape=cis_trans,color=LOD)) +
scale_color_gradient(low="magenta1",high="magenta4") +
geom_point() +
facet_wrap(~chr,nrow=1) +
theme_bw() +
xlab("QTL position")
ggsave(str_c("../output/MR_gene_eQTL_cistrans_UN_CIM_", Sys.Date(), ".png"),width=10,height = 5)
sig_chromosomes_UN <- scanone_UN_gather %>%
group_by(gene,chr) %>%
summarize(pos=pos[which.max(LOD)],LOD=max(LOD)) %>%
filter(LOD > newthreshold)
sig_chromosomes_UN
now for each significant chromosome/trait combo run bayesint
bayesint_list_UN <- apply(sig_chromosomes_UN,1,function(hit) {
result <- bayesint(scanone_CIM_UN[c("chr","pos",hit["gene"])],
chr=hit["chr"],
lodcolumn = 1,
prob=0.99,
expandtomarkers = TRUE
)
colnames(result)[3] <- "LOD"
result
})
names(bayesint_list_UN) <- sig_chromosomes_UN$gene
bayesint_list_UN <- lapply(bayesint_list_UN,function(x) x %>%
as.data.frame(stringsAsFactors=FALSE) %>%
rownames_to_column(var="markername") %>%
mutate(chr=as.character(chr))
)
bayesint_result_UN <- as.tibble(bind_rows(bayesint_list_UN,.id="gene")) %>%
select(gene,chr,pos,markername,LOD) %>%
separate(markername,into=c("chr1","Mbp"),sep="x", convert=TRUE) %>%
group_by(gene,chr) %>%
summarize(eQTL_start_bp=min(Mbp), eQTL_end_bp=max(Mbp), eQTL_start_cM=min(pos), eQTL_end_cM=max(pos), min_eQTL_LOD=min(LOD), max_eQTL_LOD=max(LOD)) %>%
#for the high QTL peaks the interval width is 0. That is overly precise and need to widen those.
mutate(expand_interval=eQTL_start_cM==eQTL_end_cM,
eQTL_start_cM=ifelse(expand_interval ,max(0, eQTL_start_cM - 2.5), eQTL_start_cM),
eQTL_end_cM=ifelse(expand_interval, eQTL_end_cM + 2.5, eQTL_end_cM),
eQTL_start_bp=ifelse(expand_interval,
{find.flanking(cross.all, chr, eQTL_start_cM) %>%
pull(left) %>%
str_remove("^A[01][0-9]x") %>%
as.numeric()},
eQTL_start_bp),
eQTL_end_bp=ifelse(expand_interval,
{find.flanking(cross.all, chr, eQTL_end_cM) %>%
pull(left) %>%
str_remove("^A[01][0-9]x") %>%
as.numeric()},
eQTL_end_bp))
`as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
[90mThis warning is displayed once per session.[39m
bayesint_result_UN
Load annotation
BrapaAnnotation <- read_csv("../input/Brapa_V1.5_annotated.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = [32mcol_double()[39m,
name = [31mcol_character()[39m,
chrom = [31mcol_character()[39m,
start = [32mcol_double()[39m,
end = [32mcol_double()[39m,
subject = [31mcol_character()[39m,
AGI = [31mcol_character()[39m,
At_symbol = [31mcol_character()[39m,
At_description = [31mcol_character()[39m,
perc_ID = [32mcol_double()[39m,
aln_length = [32mcol_double()[39m,
mismatch = [32mcol_double()[39m,
gap_open = [32mcol_double()[39m,
qstart = [32mcol_double()[39m,
qend = [32mcol_double()[39m,
sstart = [32mcol_double()[39m,
send = [32mcol_double()[39m,
eval = [32mcol_double()[39m,
score = [32mcol_double()[39m
)
BrapaAnnotation
UN_annotated <- lapply(1:nrow(bayesint_result_UN),function(row) {
qtl <- bayesint_result_UN[row,]
subset(BrapaAnnotation, chrom==qtl$chr &
start >= qtl$eQTL_start_bp &
end <= qtl$eQTL_end_bp)
}
)
names(UN_annotated) <- bayesint_result_UN$gene
UN_annotated <- bind_rows(UN_annotated,.id="MR_gene") %>%
left_join(bayesint_result_UN,by=c("MR_gene"="gene","chrom"="chr")) %>% #get eQTL LOD
left_join(MR_UN_genes,by=c("MR_gene"="name")) %>% # get cutoff
dplyr::rename(eQTL_candidate=name)
UN_annotated_small <- UN_annotated %>% select(MR_gene,MR_Cutoff,MR_chrom=transcript_chrom,MR_pos=transcript_pos,eQTL_chrom=chrom, eQTL_start_bp, eQTL_end_bp, eQTL_start_cM, eQTL_end_cM, eQTL_candidate,ends_with("LOD"))
UN_annotated_small
write_csv(UN_annotated_small, path=str_c("../output/",
filebase,
"_MR_eQTL_UN_ALL_CIM_",
Sys.Date(), ".csv"))
Total # of MR genes:
MR_UN_genes %>% pull(name) %>% unique() %>% length()
[1] 56
Total # of MR genes with an eQTL:
UN_annotated_small %>% pull(MR_gene) %>% unique() %>% length()
[1] 40
Chromosomes with eQTL:
UN_annotated_small %>%
group_by(MR_gene, eQTL_chrom) %>%
filter(!duplicated(MR_gene)) %>%
pull(eQTL_chrom) %>% unique() %>% sort()
[1] "A01" "A03" "A04" "A06" "A09" "A10"
cis eQTL? Because the BayesInt intervals may be too small, define cis as anything on the same chromosome
UN_annotated_small %>%
filter(MR_gene==eQTL_candidate) %>%
filter(!duplicated(MR_gene))
total # of MR genes with cis eQTL
#total MR genes with an eQTL at all:
cat("total MR genes with an eQTL:", {
scanone_UN_gather %>%
filter(LOD > newthreshold) %>%
filter(!duplicated(gene)) %>%
nrow()})
total MR genes with an eQTL: 40
#total MR genes with a cis eQTL
cat("\ntotal MR genes with a cis eQTL:", {
UN_annotated_small %>%
filter(MR_gene==eQTL_candidate) %>%
filter(!duplicated(MR_gene)) %>%
nrow()
})
total MR genes with a cis eQTL: 22
number of genes with a trans eQTL (only consider a single eQTL per chrom):
cis_eQTL <- UN_annotated_small %>%
filter(MR_gene==eQTL_candidate)
UN_annotated_small %>% anti_join(cis_eQTL, by=c("MR_gene", "eQTL_chrom", "eQTL_start_bp", "eQTL_end_bp")) %>%
group_by(MR_gene, eQTL_chrom) %>%
filter(!duplicated(MR_gene)) %>%
ungroup() %>%
arrange(MR_gene)
So, of the 40 genes with eQTL, 22 had cis, 19 had trans, and 1 had both.
given bayesint results, find overlaps with UN growth QTL
UN_MReQTL_QTL_combined <- inner_join(QTLgenes,UN_annotated_small,by=c("name"="eQTL_candidate")) %>%
select(.id, MR_gene, MR_Cutoff, eQTL_start_bp, eQTL_end_bp, ends_with("LOD"), everything()) %>%
arrange(.id,desc(max_eQTL_LOD)) %>%
dplyr::rename(eQTL_candidate=name)
UN_MReQTL_QTL_combined
UN_MReQTL_QTL_combined_small <- UN_MReQTL_QTL_combined %>% filter(!duplicated(eQTL_candidate)) %>%
select(-MR_gene,-MR_Cutoff)
UN_MReQTL_QTL_combined_small
number of cis eQTL in overlapping regions
#total MR genes with an eQTL at all:
cat("total MR genes with an eQTL that overlaps FVT:", {
UN_MReQTL_QTL_combined %>%
filter(!duplicated(MR_gene)) %>%
nrow()})
total MR genes with an eQTL that overlaps FVT: 37
#total MR genes with a cis eQTL
cat("\ntotal MR genes with a cis eQTL that overlaps FVT eQTL:", {
UN_MReQTL_QTL_combined %>%
filter(MR_gene==eQTL_candidate) %>%
filter(!duplicated(MR_gene)) %>%
nrow()
})
total MR genes with a cis eQTL that overlaps FVT eQTL: 18
number of trans eQTL in overlapping regions
cis_eQTL_FVT <- UN_MReQTL_QTL_combined %>%
filter(MR_gene==eQTL_candidate)
UN_MReQTL_QTL_combined %>% anti_join(cis_eQTL_FVT, by=c("MR_gene", "eQTL_chrom", "eQTL_start_bp", "eQTL_end_bp")) %>%
group_by(MR_gene, eQTL_chrom) %>%
filter(!duplicated(MR_gene)) %>%
ungroup() %>%
nrow()
[1] 20
eQTL LOD score
range(c(UN_MReQTL_QTL_combined$min_eQTL_LOD, UN_MReQTL_QTL_combined$max_eQTL_LOD))
[1] 4.903486 100.521575
how many FVT QTL have at least some overlap?
length(unique(QTLgenes$.id))
[1] 16
length(unique(UN_MReQTL_QTL_combined$.id))
[1] 8
8 of 16
write_csv(UN_MReQTL_QTL_combined,
path=str_c("../output/", filebase, "_MR_eQTL_UN_overlap_CIM_", Sys.Date(), ".csv"))
How to assess if overlap is significant?
I think pull regions of same size as eQTL and ask how often they overlap with growth QTL.
For each eQTL, randomly select a chromosome, then a position, and widen based on interval. Then check overlap. Repeat.
Make table of chromosome info
chr.info <- scanone_CIM_UN %>%
as.data.frame() %>%
rownames_to_column("marker") %>%
select(marker) %>%
separate(marker,into=c("chr","bp"),sep="x",convert=TRUE) %>%
group_by(chr) %>%
summarize(start=min(bp),end=max(bp))
Make a table of QTL info
qtl.info <- QTLgenes %>%
group_by(.id) %>%
summarize(chrom=unique(chrom),start=min(start),end=max(end))
qtl.info
qtl.ranges <- GRanges(seqnames = qtl.info$chrom,ranges=IRanges(start=qtl.info$start,end=qtl.info$end))
qtl.ranges <- GenomicRanges::reduce(qtl.ranges)
The eQTL are in Bayesint_results
sims <- 1000
eQTL.ranges <- GRanges(bayesint_result_UN$chr,
ranges = IRanges(start=bayesint_result_UN$eQTL_start_bp,
end=bayesint_result_UN$eQTL_end_bp))
eQTL.ranges <- GenomicRanges::reduce(eQTL.ranges)
set.seed(54321)
sim.results <- sapply(1:sims, function(s) {
sim.eQTL <- tibble(
chr=sample(chr.info$chr,
size = length(eQTL.ranges),
replace = TRUE,
prob=chr.info$end/sum(chr.info$end)),
width=width(eQTL.ranges) # width of the QTL to simulate
)
sim.eQTL <- chr.info %>%
select(chr,chr.start=start,chr.end=end) %>% right_join(sim.eQTL,by="chr") #need to get the chrom end so we can sample correctly
sim.eQTL <- sim.eQTL %>% mutate(qtl.start = runif(n=n(),
min = chr.start,
max= max(chr.start,chr.end-width)),
qtl.end=qtl.start+width)
sim.eQTL.ranges <- GRanges(seqnames = sim.eQTL$chr,ranges = IRanges(start=sim.eQTL$qtl.start,end=sim.eQTL$qtl.end))
suppressWarnings(result <- sum(countOverlaps(qtl.ranges,sim.eQTL.ranges)>0))
result
})
true.overlap <- sum(countOverlaps(qtl.ranges,eQTL.ranges)) #OK to ignore warnings
Each of the 2 combined objects has sequence levels not in the other:
- in 'x': A07, A05
- in 'y': A04, A01
Make sure to always combine/compare objects based on the same reference
genome (use suppressWarnings() to suppress this warning).
true.overlap
[1] 5
mean(sim.results >= true.overlap)
[1] 0.001
sum(sim.results >= true.overlap)
[1] 1
tibble(FVTQTL_vs_MReQTL_True_Overlaps=true.overlap,
N_Simulations_fewer_overlaps=sum(sim.results < true.overlap),
N_Simulations_greater_equal_overlaps=sum(sim.results >= true.overlap),
P_value=mean(sim.results >= true.overlap)
) %>%
write_csv(str_c("../output/", filebase, "_MReQTL_overlap_pval_CIM", Sys.Date(), ".csv"))
significant; only 1 of the simulations had as many overlaps as in the true data set.